#%%
import numpy as np
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
import gc
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from transformers import LongformerTokenizer, TFLongformerForSequenceClassification
from transformers import AdamW


#%%

# Read the dev, test and train split tables (in that order) and keep only
# their first two columns -- Participant ID and PHQ8_Binary per the original
# note on this cell.
dataset1, dataset2, dataset3 = (
    np.array(pd.read_csv(split_csv, delimiter=',', encoding='utf-8'))[:, 0:2]
    for split_csv in (
        '../split_data/dev_split_Depression_AVEC2017.csv',
        '../split_data/full_test_split.csv',
        '../split_data/train_split_Depression_AVEC2017.csv',
    )
)

# Running counter consumed by the training-split loading loop later in the file.
countPos = 0
# Single lookup table holding the rows of all three splits: dev, test, train.
dataset = np.concatenate((dataset1, dataset2, dataset3))
def checkPosNeg(dataset, index):
    """Return the second-column value of the first row whose first column equals ``index``.

    Args:
        dataset: Indexable sequence of rows; ``row[0]`` is compared against
            ``index`` and ``row[1]`` is the value handed back.
        index: Value to look for in the first column.

    Returns:
        The second element of the first matching row, or ``0`` when no row
        matches (including when ``dataset`` is empty).
    """
    matching_values = (
        dataset[pos][1]
        for pos in range(len(dataset))
        if dataset[pos][0] == index
    )
    # `next` stops at the first hit; 0 is the fallback when nothing matches.
    return next(matching_values, 0)

# Accumulators filled by the loading loops later in the file: transcripts and
# labels for the training side, then for the test side.
Data, Y = [], []
Data_test, Y_test = [], []

# Position of the most recently appended entry of Y (-1 while Y is empty).
index = -1

#%%

import google.cloud
from google.cloud import storage
import pandas as pd
from io import StringIO

def read_csv_blob(bucket_name, source_blob_name):
    """Fetch a Cloud Storage object and expose its text as a file-like object.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Path of the object inside the bucket.

    Returns:
        A ``StringIO`` wrapping the blob's full text content, suitable for
        passing to ``pd.read_csv``.
    """
    client = storage.Client(project='lofty-psyche-405510')
    blob_handle = client.bucket(bucket_name).blob(source_blob_name)

    # The whole object is downloaded into memory as text before being wrapped.
    return StringIO(blob_handle.download_as_text())

bucket_name = "mlpr"


def _load_transcript_columns(split_dir, participant_id):
    """Download one participant's transcript and return columns 2-3 of it.

    Args:
        split_dir: Folder under ``DAIC/`` holding the split, e.g. ``"train_data"``.
        participant_id: Numeric participant id; converted with ``int`` before
            being placed in the blob path.

    Returns:
        NumPy array with the ``[:, 2:4]`` slice of the tab-separated transcript.
    """
    pid = str(int(participant_id))
    blob_path = f"DAIC/{split_dir}/{pid}_P/{pid}_TRANSCRIPT.csv"
    stream = read_csv_blob(bucket_name, blob_path)
    return np.array(pd.read_csv(stream, delimiter='\t', encoding='utf-8'))[:, 2:4]


# Training split: label-0 participants are skipped once more than 38 of them
# have been kept. Despite its name, countPos counts the kept label-0 entries.
for train_row in dataset3:
    val = checkPosNeg(dataset, train_row[0])
    if val == 0 and countPos > 38:
        continue
    Y.append(val)
    index += 1
    if Y[index] == 0:
        countPos += 1
    Data.append(_load_transcript_columns("train_data", train_row[0]))

# Dev split: only participants with a non-zero label are added to the
# training pool.
for dev_row in dataset1:
    val = checkPosNeg(dataset, dev_row[0])
    if val == 0:
        continue
    Y.append(val)
    Data.append(_load_transcript_columns("dev_data", dev_row[0]))

# Test split: every participant is kept.
for test_row in dataset2:
    Y_test.append(checkPosNeg(dataset, test_row[0]))
    Data_test.append(_load_transcript_columns("test_data", test_row[0]))

# Freeze the label lists as arrays and prepare the containers that the next
# cell fills with participant-only utterances.
Y = np.array(Y)
Y_test = np.array(Y_test)

Data2 = []
Data2_test = []

#%%
def _participant_utterances(transcript):
    """Return the second-column values of rows whose first column is "Participant".

    Row 0 is never inspected, mirroring the original loop that started at 1.
    NOTE(review): the transcripts come from ``pd.read_csv``, which already
    consumes the header line, so this skips the first data row -- confirm
    that is intended.
    """
    return [
        transcript[row][1]
        for row in range(1, len(transcript))
        if transcript[row][0] == "Participant"
    ]


# Keep only what the participant said, one list of utterances per transcript.
for transcript in Data:
    Data2.append(_participant_utterances(transcript))

for transcript in Data_test:
    Data2_test.append(_participant_utterances(transcript))

# Names used by the rest of the pipeline; the data names alias the lists above.
train_data = Data2
test_data = Data2_test
train_labels = np.array(Y)
test_labels = np.array(Y_test)

#%%
# gs:// location of the GoogleNews vectors file; nothing in this section reads it.
gs_bucket_url = 'gs://mlpr/GoogleNews-vectors-negative300.bin'

# NLTK's English stop-word list, held as a set for the membership tests in
# remove_StopWOrds.
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

# for k in range(len(Data2)):
# 	for i in range(min(max_num_sentence, len(Data2[k]))):
# 		try:
# 			sentence = Data2[k][i].split(" ")
# 		except:
# 			continue
# 		sentence = remove_StopWOrds(sentence)

def remove_StopWOrds(person):
    """Flatten one participant's utterances into a single stop-word-free string.

    Args:
        person: Iterable of utterances. Entries that are ``float`` instances
            are skipped (presumably NaN placeholders for empty cells --
            confirm against the transcript data).

    Returns:
        str: Every kept word followed by one space, in original order, so a
        non-empty result ends with a trailing space. Empty string when no
        word survives.
    """
    pieces = []
    for utterance in person:
        if isinstance(utterance, float):
            continue
        # Split on single spaces only, exactly like the word-level filter expects.
        pieces.extend(
            word + ' ' for word in utterance.split(" ") if word not in stop_words
        )
    return ''.join(pieces)

# Collapse each participant's utterance list into one stop-word-filtered string.
new_train_data = [remove_StopWOrds(utterances) for utterances in train_data]
new_test_data = [remove_StopWOrds(utterances) for utterances in test_data]

# Rebind the intermediate names and trigger a collection pass.
# NOTE(review): train_data / test_data were bound to these same list objects
# earlier in the file, so the lists stay alive after this rebinding.
Data2, Data2_test = [], []
gc.collect()

#%%
# Load the pretrained tokenizer and a two-label sequence-classification model
# from the same checkpoint.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = TFBertForSequenceClassification.from_pretrained(_BERT_CHECKPOINT, num_labels=2)

# Function to tokenize the transcript data
def tokenize_transcripts(data, max_length=512):
    """Encode transcripts into fixed-length input ids and attention masks.

    Args:
        data: Iterable of transcripts; each item is handed unchanged to
            ``tokenizer.encode_plus``.
        max_length: Length every encoded sequence is padded or truncated to.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        tuple: ``(input_ids, attention_masks)`` as NumPy arrays with one row
        per transcript.
    """
    input_ids = []
    attention_masks = []

    for transcript in data:
        encoded_data = tokenizer.encode_plus(
            transcript,
            add_special_tokens=True,
            max_length=max_length,
            # Explicit padding/truncation replaces the deprecated
            # `pad_to_max_length=True`, so every row has exactly
            # `max_length` tokens even for very long transcripts.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf'
        )
        # Drop only the leading batch dimension added by return_tensors='tf'.
        input_ids.append(tf.squeeze(encoded_data['input_ids'], axis=0))
        attention_masks.append(tf.squeeze(encoded_data['attention_mask'], axis=0))

    return np.array(input_ids), np.array(attention_masks)

# # Initialize Longformer tokenizer and model
# tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
# longformer_model = TFLongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', num_labels=2)

# # Function to tokenize the transcript data
# def tokenize_transcripts(data):
#     input_ids = []
#     attention_masks = []

#     for transcript in data:
#         encoded_data = tokenizer.encode_plus(
#             transcript,
#             add_special_tokens=True,
#             max_length=4096,  # Longformer's max length
#             padding='max_length',
#             truncation=True,
#             return_attention_mask=True,
#             return_tensors='tf'
#         )
#         # Reshape to remove the extra dimension
#         input_ids.append(tf.squeeze(encoded_data['input_ids']))
#         attention_masks.append(tf.squeeze(encoded_data['attention_mask']))

#     return np.array(input_ids), np.array(attention_masks)

# # Assume train_data, test_data, train_labels, and test_labels are defined
# Tokenize the stop-word-filtered transcript strings. The raw train_data /
# test_data are lists of utterance lists (which may contain float entries),
# not the single text per participant that the tokenizer call expects.
train_input_ids, train_attention_masks = tokenize_transcripts(new_train_data)
test_input_ids, test_attention_masks = tokenize_transcripts(new_test_data)

#%%

# Compile the BERT model
# Fine-tuning a pretrained transformer needs a small step size; 0.01 was far
# too large. These values match the Longformer settings kept in the comments
# at the end of this file.
optimizer = Adam(learning_rate=2e-5, epsilon=1e-08)
# The classification head returns raw logits, so the loss must be told not to
# treat them as probabilities (the plain string loss assumes probabilities).
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
bert_model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Train the BERT model
bert_model.fit([train_input_ids, train_attention_masks], train_labels, epochs=10, batch_size=8)

# Make predictions on the test data
predictions = bert_model.predict([test_input_ids, test_attention_masks])

# Print classification report (first output element holds the per-class logits)
print(classification_report(test_labels, np.argmax(predictions[0], axis=1), digits=4))

# # Compile the Longformer model
# optimizer = Adam(learning_rate=2e-5, epsilon=1e-08)
# longformer_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Train the Longformer model
# longformer_model.fit([train_input_ids, train_attention_masks], train_labels, epochs=3, batch_size=4)  # Adjust batch size if needed

# # Make predictions on the test data
# predictions = longformer_model.predict([test_input_ids, test_attention_masks])

# # Print classification report
# print(classification_report(test_labels, np.argmax(predictions[0], axis=1), digits=4))
# %%
